cd PATH TO DATA DIRECTORY

local csvfile "PATH TO HOURLY DATA CSV" // name of csv file to be ingested

*******************************************************************************
*Step 0: Generate NOAA_hourly_modified_sorted.dta from NOAA_hourly.dta by running the following code:
	import delimited "`csvfile'", varnames(1)
	gen double GMTdatetime = clock( date_gmt + hour_gmt , "YMD hm")
	drop date_gmt hour_gmt
	format GMTdatetime %tc
	drop if mi(GMTdatetime)
	drop if year(dofc(GMTdatetime))<1990
	drop if zip_code == 11111
	sort zip_code GMTdatetime
	replace zip_code = 02747 if zip_code == 27470
	save NOAA_hourly_modified_sorted.dta, replace
*/
	
/******************************************************************************
	Generate DST.dta
*******************************************************************************/
* this code generates a variable DSTday marking days when DST is in effect
tempfile calendar
clear
set obs `=(date("CURRENT DATE","YMD")-date("1989-12-31","YMD"))' // Update this line with current date
gen date = _n + date("1989-12-31","YMD")
format date %td
gen month = month(date)
gen dow = dow(date)
gen year = year(date)
bys year month dow (date): gen order = _n
bys year month dow (date): gen last  = _n == _N
gen DSTday = dow == 0 & ((year < 2007 & month == 4 & order == 1) | (year >= 2007 & month == 3 & order == 2)) // see 15 USC 260a(a) as amended by PL 109-58: 1st Sunday of April through 2006, 2nd Sunday of March thereafter
bys year (date): replace DSTday = 1 if DSTday[_n-1] == 1 // this sets DST to 1 for all dates following the one from the preceding line
replace DSTday = 0  if dow == 0 & ((year < 2007 & month == 10 & last == 1) | (year >= 2007 & month == 11 & order == 1)) // see 15 USC 260a(a) as amended by PL 109-58: 1st Sunday of April through 2006, 2nd Sunday of March thereafter
bys year (date): replace DSTday = 0 if DSTday[_n-1] == 0 & month > 6 // see above -- restriction to month>6 necessary or else 0s in Jan-March/April would turn all subsequent entries into 1s
drop month dow year order last
compress
save DST.dta, replace

/*******************************************************************************
1.
Download the zip code and time zone data from https://boutell.com/zipcodes/
Manually add missing zip codes fromo NOAA_hourly_modified_sorted.dta
Merge with zip_code_GMTdatetime_modified_sorted.dta
*******************************************************************************/
*copy "https://boutell.com/zipcodes/zipcode.zip" "zipcode.zip"
*unzipfile zipcode.zip, replace
import delimited zipcode.csv, clear
drop city state latitude longitude
rename (zip timezone) (zip_code time_zone)
assert zip_code != .
input
* ELOY INS DETENTION CENTER
85131 -7 0
* ELOY INS DETENTION CENTER
85132 -7 0
* San Diego: Federal Building
92188 -8 1
* Guam
96910 10 0
* Guam
96913 10 0
* Northern Mariana Islands
96950 10 0
end
sort zip_code
save zip_code.dta, replace

/*******************************************************************************
2.
Get the full set of zip_code and GMTdatetime data from NOAA_hourly_modified_sorted.dta
Save the unique sorted zip codes as zip_code_modified_sorted.dta
*******************************************************************************/
use zip_code GMTdatetime using NOAA_hourly_modified_sorted.dta, clear
bysort zip_code GMTdatetime: keep if _n == 1
merge m:1 zip_code using zip_code.dta, sorted keep(3) assert(2 3) nogenerate

/*******************************************************************************
3. Calculate localdatetime
*******************************************************************************/

gen double intermediate_localdatetime = GMTdatetime + time_zone * 60 * 60 * 1000
assert intermediate_localdatetime != .
drop time_zone
format intermediate_localdatetime %tc

* Create an intermediate local date (without time) as key for merging
gen double date = dofc(intermediate_localdatetime)
format date %td

/*
This step will generate _merge == 1 (1,311 obs.)
This is due to rounding of time back to before 1990.
This step will also generate _merge == 2 (5 obs.)
This is due to date not exisiting in NOAA_hourly_modified_sorted.dta.
*/

merge m:1 date using DST.dta
assert date < date("19900101","YMD") if _merge == 1
drop if _merge < 3
drop date _merge

gen double localdatetime = intermediate_localdatetime + dst * DSTday * 60 * 60 * 1000
drop dst intermediate_localdatetime DST
format localdatetime %tc
sort zip_code GMTdatetime
save zip_code_modified_sorted.dta, replace

/*******************************************************************************
4. Merge with NOAA_hourly.dta
*******************************************************************************/

merge 1:m zip_code GMTdatetime using NOAA_hourly_modified_sorted.dta, sorted

/*
This step will generate _merge == 2 (4,102 obs.)
Those are all dates at the beginning of 1990, and do not have a corresponding 
localdatetime due to rounding before 1990.
*/
assert dofc(GMTdatetime) < date("19900102","YMD") if _merge == 2
drop if _merge < 3
drop _merge
destring, ignore("NaNANe") replace
save NOAA.dta, replace

log close

/*******************************************************************************
5. Summarizing variables after cleaning
*******************************************************************************/

log using ./noaa_epa_summary

// post cleaning summary (cleaning done in R during data acquisition)

summarize temp_noaa dew_noaa pressure wind_speed_noaa prec_noaa pm25 pm25_local carbon_monoxide ozone, detail

// saving log file as pdf

translate ./noaa_epa_summary.pdf

log close
